# topic 24 # This will be remarkably similar to the work in # topics 21 and 22, so much so that it is worth examining the # two to see the changes. # First, set up the situation. We have two populations, # each having a certain proportion if items with the # characteristic "2" # source("../gnrnd5.R") source("../gnrnd4.R") gnrnd5( 56823499907, 475685) pop_one <- L1 # look at the first 100 items in pop_one head( pop_one, 100) gnrnd5( 77843499907, 847655) pop_two <- L1 # look at the first 100 items in pop_two head( pop_two, 100) ################################################ ### Confidence Interval ### ################################################ # We want to construct a 95% confidence interval # for the difference of the proportion of 2's, p_one - p_two. # To do this we will take random samples of both, # the size of the sample from pop_one will be 94 # and the size of the sample from pop_two will # be 87. n1 <- 94 n2 <- 87 # for now, we will get the samples by generating the # appropriate random index numbers for the two # populations. # get the sample and sample statistics from pop_one gnrnd4( 982049301, 500000001) L1 index_one <- L1 samp_one <- pop_one[ index_one ] samp_one # now find the number of 2's in that sample freqs_one <- table( samp_one ) freqs_one phat_one <- freqs_one[2]/n1 names( phat_one)<-"phat_one" phat_one # get the sample and sample statistics from pop_two gnrnd4( 387068601, 500000001) L1 index_two <- L1 samp_two <- pop_two[ index_two ] samp_two # now find the number of 2's in that sample freqs_two <- table( samp_two ) freqs_two phat_two <- freqs_two[2]/n2 names( phat_two)<-"phat_two" phat_two #### Therefore we can find the difference of the sample # proportions diff_of_props <- phat_one - phat_two diff_of_props #### We want to use the normal approximation for the distribution #### of the difference of the sample proportions. To do this #### we want to be sure that we are not sampling more than #### 5% of the population and that we have at least 10 successes #### (items with the characteristic) and 10 failures (items #### without the characteristic) in our samples. We meet #### both conditions. #### The the standard deviation of the difference of the two ####sample proportions is # std_err <- sqrt( phat_one*(1-phat_one)/n1 + phat_two*(1-phat_two)/n2) std_err #### for a 95% confidence interval we need the z-value that #### has 2.5% of the area under the standard normal curve #### above that value # z_val <- qnorm( 0.025, lower.tail=FALSE) z_val # # so our 95% confidence interval is low_end <- diff_of_props - z_val*std_err low_end # and high_end <- diff_of_props + z_val*std_err high_end # # That pattern of commands would be the same for each # instance of such a problem. We have a function # that does this. It is called ci_2popproportion(). source("../ci_2popproport.R") ci_2popproportion( n1, freqs_one[2], n2, freqs_two[2], 0.95) # Now, it turns out that we could actually find the # proportion of 2's in each population and, # therefore, find the true difference in # the proportions # # find the number of 2's in the first population pop_freqs_1 <- table( pop_one ) pop_freqs_1 proport_one <- pop_freqs_1[2] / 5000 proport_one # # find the number of 2's in the second population pop_freqs_2 <- table( pop_two ) pop_freqs_2 proport_two <- pop_freqs_2[2] / 5000 proport_two # so the true difference is true_diff <- proport_one - proport_two true_diff # and that is indeed inside our 95% confidence # interval. # # Of course, that was for just one set of samples. # We will do this same thing for 10000 samples and # see how many confidence intervals contain the # true difference in proportions L3 <- 1:10000 for (i in 1:10000) { samp_one <- sample( pop_one, 94) freqs_one <- table(samp_one) samp_two <- sample( pop_two, 87) freqs_two <- table( samp_two) this_ci <- ci_2popproportion(94, freqs_one[2], 87, freqs_two[2], 0.95) if( this_ci[1] <= true_diff & true_diff <= this_ci[2] ) { L3[i] = "hit"} else { L3[i] = "missed"} } # see how we did table( L3 ) ################################################ ### Hypothesis test ### ################################################ # Now, someone says that they believe that the # difference of the proportions of 2's # from pop_one and pop_two, # phat_1 - phat_2 = 0. That is, the proportions are the same. # Thus our null hypothesis, H0, is that the difference # is 0. # Our alternative hypothesis, H1, is that the # difference of the proportions is greater than 0, or # that phat_one > phat_two. # We want to see if we can reject H0 in favor of H1. # We will get a sample from each # population, n1=94 and n2=87. # We are willing to be wrong in telling the # person that they are wrong 2.5% of the time! ######################## ## The critical value approach. We know that two ## samples of size 94 and 87, respectively. ## The distribution of the difference in the sample ## proportions will be normal, with the mean being ## the true difference in proportions in the ## two populations. Using our null hypothesis ## that difference should be 0. However, we need ## to find the standard deviation of the distribution ## of our difference in the sample proportions. ## To do this we will use our pooled proportion. # # Let us go back and take our original samples again # get the sample and sample statistics from pop_one gnrnd4( 982049301, 500000001) L1 index_one <- L1 samp_one <- pop_one[ index_one ] samp_one # now find the number of 2's in that sample freqs_one <- table( samp_one ) freqs_one freqs_one[2] # get the sample and sample statistics from pop_two gnrnd4( 387068601, 500000001) L1 index_two <- L1 samp_two <- pop_two[ index_two ] samp_two # now find the number of 2's in that sample freqs_two <- table( samp_two ) freqs_two freqs_two[2] # # then our pooled proportion is pooled_prop <- ( freqs_one[2]+freqs_two[2])/(n1+n2) pooled_prop # therefore we can compute the desired standard error std_err <- sqrt(pooled_prop*(1-pooled_prop)*(1/n1+1/n2)) std_err # # Our desired level of significance is 2.5%. # Also, our alternative hypothesis was the one-sided # test, greater than. # Therefore, our critical high value will be our # H0 + z_0.025*std_err # Find our z_0.025 z_high <- qnorm( 0.025, lower.tail=FALSE ) z_high # Since our H0 is 0 this means that our critical high # value is 0 + z_high*std_err # the critical high value # ###################### ## The attained significance approach asks, if H0 is true, ## then how strange would it be to get a difference of the ## proportions at this value or at any stranger value. But ## that is just a pnorm command. this_diff <- freqs_one[2]/n1 - freqs_two[2]/n2 this_diff pnorm( this_diff, mean=0, sd=std_err, lower.tail=FALSE) # # That attained significance is not less than our given # level of significance, 0.025, so we would not reject H0 # in favor of H1. ######################### ## Of course, for any similar problem we would be doing ## the same steps, with the additional complication of ## having to pay attention to the alternative. ## ## We might as well capture ## all of this in a function, hypoth_2test_prop. ## Let us load and run that for the values ## in our example. source("../hypo_2popproport.R") hypoth_2test_prop(freqs_one[2],n1, freqs_two[2],n2, 1, 0.025) ###################################################### ###################################################### ###################################### ### Now let us go back to our hypothesis test. We ### know that the true difference of these means is ## not zero. In fact, we know the real difference true_diff # # That is a difference but not a very large one. # In fact, that difference is so small that # it will be really hard to find two samples that # would allow us to reject the null hypothesis that # the proportions are the same. We will try to take # two samples 1000 times and see how often we reject # the null hypothesis. We will make that a bit easier # to do by changing the alternative to a less than; # after all, we know that the proportion in the # first population is indeed less than the proportion # in the second population. # L3 <- 1:1000 for (i in 1:1000) { samp_one <- sample( pop_one, 94) freqs_one <- table(samp_one) samp_two <- sample( pop_two, 87) freqs_two <- table( samp_two) this_test <- hypoth_2test_prop(freqs_one[2],n1, freqs_two[2],n2, -1, 0.025) L3[i] <- "do not reject" if( this_test[15] == "Reject" ) { L3[i] <- "reject" } } # see how we did table( L3 ) # # We will change the second population so that # it has a higher proportion of 2's gnrnd5( 77843499907, 547955) pop_two <- L1 # find the number of 2's in the second population pop_freqs_2 <- table( pop_two ) pop_freqs_2 proport_two <- pop_freqs_2[2] / 5000 proport_two # so the true difference is true_diff <- proport_one - proport_two true_diff L3 <- 1:1000 for (i in 1:1000) { samp_one <- sample( pop_one, 94) freqs_one <- table(samp_one) samp_two <- sample( pop_two, 87) freqs_two <- table( samp_two) this_test <- hypoth_2test_prop(freqs_one[2],n1, freqs_two[2],n2, -1, 0.025) L3[i] <- "do not reject" if( this_test[15] == "Reject" ) { L3[i] <- "reject" } } # see how we did table( L3 )